/*******************************************************************************
-------------- Effect of an Intensive Food-as-Medicine Program on --------------
------------------------- Health and Health Care Use: --------------------------
------------------------- A Randomized Clinical Trial --------------------------

-------------- Joseph J. Doyle, Marcella Alsan, Nicholas Skelley, --------------
-------------------------- Yutong Lu, and John Cawley --------------------------

This .do file replicates results that appear in Tables 1-4 in the article,
as published in JAMA Internal Medicine.

Last Updated: 28 Dec. 2023 (NS)
*******************************************************************************/

capture log close
insheet using "analysis_sample.csv", comma clear

tempfile analysis_sample
save `analysis_sample', replace

log using "output/published_tables.log", text replace

*------------------------------------------------------------------------------*
*-------------- Table 1. Characteristics of Patients at Baseline --------------*

// Construct additional race/ethnicity and timing variables
gen race_nonhisp_white = ((hispanic == 0) & (race == "White"))
gen race_nonhisp_other = ((hispanic == 0) & (race != "White") & ///
    (strmatch(race, "Black Or*") == 0))
gen days_between_tests = (hba1c_date_6mo - hba1c_date_0mo)

// Separate variables that require means (intentionally numeric variables) from
// those that should be counts and percentages (indicator/dummy variables)
local baseline_means hba1c_val_0mo age days_between_tests
local baseline_counts male hispanic race_nonhisp_black race_nonhisp_white ///
    race_nonhisp_other rural ip_i_0mo ed_i_0mo drug_metformin_0mo ///
    drug_insulin_0mo drug_glp1_0mo drug_other_0mo drug_any_db_summary_0mo

gen index = _n

// For organizational purposes, adding standard prefixes to variables indicating
// what operation(s) should be performed on them
foreach var of varlist `baseline_means' {
    gen MEAN_`var' = `var'
    gen SD_`var' = `var'
}
foreach var of varlist `baseline_counts' {
    gen N_`var' = `var'
}

// Will use this data as-is again
tempfile baseline_characteristics
save `baseline_characteristics', replace

// Program with the steps to save CSV output from each "column"
program define clean_save_t1
    version 16
    args save_as
    // Construct percentage variable for indicators
    foreach var of varlist N_* {
        gen PCT_`var' = (`var' / N) * 100
    }
    rename PCT_N_* PCT_*

    // Transpose so that each variable is a row (easier to read and compare at 
    // once)
    xpose, clear varname
    
    // Rename variables (from v1, v2, _varname) to be descriptive
    rename _varname variable
    // Check if v2 column exists (if exists, then treatment and control columns 
    // exist)
    capture confirm variable v2, exact
    if !_rc {
        // v2 exists
        rename v1 control
        rename v2 treatment
    }
    else {
        // v2 DNE
        rename v1 overall
    }

    // Export as CSV
    outsheet using `save_as', comma replace
end

*------------------------------------------------------------------------------*
// Table 1 - Column 1 (full sample, overall)

// Generate all necessary statistics
collapse (count) N=index (mean) MEAN_* (sd) SD_* (sum) N_*

clean_save_t1 "output/T1_fullsample_baseline_overall.csv"

*------------------------------------------------------------------------------*
// Table 1 - Columns 2 and 3 (full sample, by treatment status)

// Get prepared data again
use `baseline_characteristics', clear

collapse (count) N=index (mean) MEAN_* (sd) SD_* (sum) N_*, by(treat_now)

clean_save_t1 "output/T1_fullsample_baseline_grouped.csv"

*------------------------------------------------------------------------------*
// Table 1 - Column 4 (lab sample, overall)

use `baseline_characteristics', clear

// Restrict to lab sample (non-missing HbA1c at 6 months)
keep if !mi(hba1c_val_6mo)
collapse (count) N=index (mean) MEAN_* (sd) SD_* (sum) N_*

clean_save_t1 "output/T1_labsample_baseline_overall.csv"

*------------------------------------------------------------------------------*
// Table 1 - Columns 5 and 6 (lab sample, by treatment status)

use `baseline_characteristics', clear

// Restrict to lab sample (non-missing HbA1c at 6 months)
keep if !mi(hba1c_val_6mo)
collapse (count) N=index (mean) MEAN_* (sd) SD_* (sum) N_*, by(treat_now)

clean_save_t1 "output/T1_labsample_baseline_grouped.csv"
*------------------------------------------------------------------------------*


*------------------------------------------------------------------------------*
*-------------- Table 2. Program Engagement and Goal Attainment ---------------*

use `analysis_sample', clear

*------------------------------------------------------------------------------*
// Table 2 - Columns 1 and 2 (6-month mean and SD for treatment and control)
local t2_means months_with_visits_6mo site_visits_6mo dietician_visits_n_6mo ///
    dsmt_visits_6mo endocrinology_visits_n_6mo weight_visits_n_6mo fruit_num ///
    vegetables_num sweet_drink_num diet_improve_index engagement_index ///
    diabetes_knowledge_score healthy_attitudes_index self_efficacy_index ///
    exercise_smoking_index
local t2_counts never_eat_out_yn diet_improved_yn

foreach var of varlist `t2_means' {
    gen MEAN_`var' = `var'
    gen SD_`var' = `var'
}
foreach var of varlist `t2_counts' {
    gen YES_`var' = `var'
    gen TOTAL_`var' = `var'
}

collapse (mean) MEAN_* (sd) SD_* (sum) YES_* (count) TOTAL_*, by(treat_now)

// Transpose data so that variables are rows
xpose, clear varname
// Rename new columns (v1, v2, _varname) to be descriptive
rename v1 control
rename v2 treatment
rename _varname variable

// Save CSV
outsheet using "output/T2_6mo_summary.csv", comma replace

*------------------------------------------------------------------------------*
// Table 2 - Column 3 (adjusted mean difference and 95% CI)
clear all
use `analysis_sample', clear

// Adjusted differences and CIs for measures of diabetes management services
// (EHR outcomes) are computed with a heteroskedasticity-robust, longitudinal,
// fixed effects model that includes controls for a flexible baseline and
// prespecified control variables (rural site, baseline HbA1c category, 5-year
// age bins, and indicators for being male, non-Hispanic Black, and Hispanic).
local ehr_outcomes "dietician_visits_n dsmt_visits endocrinology_visits_n weight_visits_n"

// Adjusted differences and CIs for all other measures in Table 2 come from a
// heteroskedasticity-robust, cross-sectional, fixed effects model that includes
// pre-specified controls.
local t2_xs_outcomes fruit_num vegetables_num sweet_drink_num ///
    never_eat_out_yn diet_improved_yn diet_improve_index engagement_index ///
    diabetes_knowledge_score healthy_attitudes_index self_efficacy_index ///
    exercise_smoking_index

// Set up 5-year age bins so that excluded group is the top bin (72+)
unab agebins: age5_*
unab ageomit: age5_10
local agebins: list agebins - ageomit

// Create frame to store estimates and CIs
frame create adjusted_diffs
frame change adjusted_diffs
set obs 0
gen variable = ""
gen adj_diff = .
gen ci_low = .
gen ci_high = .
frame change default
// Number of rows in the (new) adjusted_diffs frame
local out_rows 0

// Cross-sectional outcomes
foreach var of varlist `t2_xs_outcomes' {
    regress `var' treat_now rural hba1c_cat_control `agebins' male ///
        race_nonhisp_black hispanic, robust
    quietly {
        local out_rows = `out_rows' + 1
        frame change adjusted_diffs
        set obs `out_rows'
        replace variable = "`var'" if _n == `out_rows'
        replace adj_diff = r(table)["b", "treat_now"] if _n == `out_rows'
        replace ci_low = r(table)["ll", "treat_now"] if _n == `out_rows'
        replace ci_high = r(table)["ul", "treat_now"] if _n == `out_rows'
        frame change default
    }
}

// EHR outcomes
foreach var of local ehr_outcomes {
    use `analysis_sample', clear

    // Preserve baseline value before making longitudinal
    gen `var'_baseline = `var'_0mo

    // Outcomes are at 6 months; drop 12-month column
    drop `var'_12mo
    // Keep only subjects who have valid 6-month values
    drop if mi(`var'_6mo)

    // Reshape to be longitudinal
    reshape long `var'_@mo, i(subject_id) j(months)
    // Clean up after transformation
    rename `var'_mo `var'_panel

    // Indicator for the value we'd look at in a cross-sectional model (value at
    // 6 months)
    gen post_period = (months == 6)

    // Interaction terms:
    // Variable of interest: 1[6-month outcome] * 1[treatment group]
    gen post_treat_now = post_period * treat_now
    // Flexible baseline: 1[6-month outcome] * baseline value
    gen post_baseline = post_period * `var'_baseline

    // Unique subject identifier
    egen n_subject_id = max(_n), by(subject_id)

    // Confirm longitudinal form suitable for xtreg
    xtset n_subject_id post_period

    xtreg `var'_panel post_period treat_now post_treat_now post_baseline ///
        rural hba1c_cat_control `agebins' male race_nonhisp_black hispanic, ///
        fe i(n_subject_id) robust

    // Store estimates in the adjusted_diffs frame
    quietly {
        local out_rows = `out_rows' + 1
        frame change adjusted_diffs
        set obs `out_rows'
        replace variable = "`var'" if _n == `out_rows'
        replace adj_diff = r(table)["b", "post_treat_now"] if _n == `out_rows'
        replace ci_low = r(table)["ll", "post_treat_now"] if _n == `out_rows'
        replace ci_high = r(table)["ul", "post_treat_now"] if _n == `out_rows'
        frame change default
    }
}


// Save regression results as CSV
frame adjusted_diffs: outsheet using "output/T2_6mo_adjusted_diffs.csv", ///
    comma replace

*------------------------------------------------------------------------------*
// Table 2 - Columns 4 and 5 (12-month mean and SD for treatment and control,
//           Program-tracking measures only)
use `analysis_sample', clear

collapse (mean) MEAN_site_visits_12mo=site_visits_12mo ///
    MEAN_months_with_visits_12mo=months_with_visits_12mo ///
    (sd) SD_site_visits_12mo=site_visits_12mo ///
    SD_months_with_visits_12mo=months_with_visits_12mo, by(treat_now)

xpose, clear varname
rename v1 control
rename v2 treatment
rename _varname variable

outsheet using "output/T2_12mo_program_tracking_means.csv", comma replace
*------------------------------------------------------------------------------*


*------------------------------------------------------------------------------*
*------------- Table 3. Effects of Program on Biometric Outcomes --------------*
// All biometric outcomes use longitudinal, fixed-effects models with a flexible
// baseline control and controls for pre-specified variables (see above).
clear all

use `analysis_sample', clear

// Set up 5-year age bins so that excluded group is the top bin (72+)
unab agebins: age5_*
unab ageomit: age5_10
local agebins: list agebins - ageomit

// Create frame for storing estimates
frame create table3
frame change table3
gen biometric = ""
foreach name in "month" "ctl_mean" "ctl_sd" "trt_mean" "trt_sd" "unadj_diff" ///
    "unadj_ci_low" "unadj_ci_high" "adj_diff" "adj_ci_low" "adj_ci_high" {
    gen `name' = .
}
frame change default
local table3_rows 0

foreach month in 6 12 {
    foreach var in "hba1c" "ldl" "hdl" "total_chol" "trigly" "weight" ///
        "sys" "dias" {
        use `analysis_sample', clear

        // Add a new row to the table3 frame
        local table3_rows = `table3_rows' + 1
        frame table3: set obs `table3_rows'

        // Fill in mean/SD values
        quietly {
            frame change table3
            replace biometric = "`var'" if _n == `table3_rows'
            replace month = `month' if _n == `table3_rows'
            // Control group
            frame default: summarize `var'_val_`month'mo if treat_now == 0
            replace ctl_mean = `r(mean)' if _n == `table3_rows'
            replace ctl_sd = `r(sd)' if _n == `table3_rows'
            // Treatment group
            frame default: summarize `var'_val_`month'mo if treat_now == 1
            replace trt_mean = `r(mean)' if _n == `table3_rows'
            replace trt_sd = `r(sd)' if _n == `table3_rows'
            frame change default
        }
        // Unadjusted difference values
        regress `var'_val_`month'mo treat_now, robust
        
        // Fill in cells of table3 frame
        frame change table3
        replace unadj_diff = r(table)["b", "treat_now"] if _n == `table3_rows'
        replace unadj_ci_low = r(table)["ll", "treat_now"] if ///
            _n == `table3_rows'
        replace unadj_ci_high = r(table)["ul", "treat_now"] if ///
            _n == `table3_rows'
        frame change default

        // Longitudinal model -- Reshape data to be longitudinal
        // Keep only subjects with valid n-month values
        drop if mi(`var'_val_`month'mo)
        // Drop unneeded month column
        unab todrop: `var'_*mo
        unab tokeep: `var'_*_`month'mo `var'_*_0mo
        local todrop: list todrop - tokeep
        drop `todrop'
        // Reshape to be longitudinal
        gen `var'_baseline = `var'_val_0mo
        reshape long `var'_val_@mo `var'_date_@mo, i(subject_id) j(months)
        // Clean up after transformation
        rename `var'_val*mo `var'_panel
        rename `var'_date*mo `var'_time
        replace `var'_time = `var'_time - consent_date

        // Indicators and interactions
        gen post_period = (months == `month')
        gen post_treat_now = post_period * treat_now
        egen n_subject_id = max(_n), by(subject_id)

        // Confirm longitudinal form suitable for xtreg
        xtset n_subject_id post_period
        // Keep only relevant variables
        keep `var'_panel post_period treat_now post_treat_now rural ///
            `var'_baseline `var'_time hba1c_cat_control age5_* male ///
            race_nonhisp_black hispanic n_subject_id

        // Multiple imputation of baseline biometric value and time from consent
        // HbA1c lab needed to qualify for program, so no imputation needed
        if "`var'" != "hba1c" {
            mi set mlong
            mi misstable summarize male `var'_panel
            
            mi register imputed `var'_panel `var'_time

            // Note that due to randomness in imputation, results may vary
            // slightly from those seen in Table 3
            mi impute mvn `var'_panel `var'_time = treat_now post_period ///
                post_treat_now rural hba1c_cat_control male ///
                race_nonhisp_black hispanic age5_*, add(10) rseed(9112023)
            sort n_subject_id post_period
            bysort n_subject_id: replace `var'_baseline = `var'_panel[1] if ///
                mi(`var'_baseline)

            gen post_baseline = post_period * `var'_baseline

            // Longitudinal model -- Regression
            mi estimate, post: xtreg `var'_panel treat_now post_period ///
                post_treat_now post_baseline `var'_time rural ///
                hba1c_cat_control `agebins' male race_nonhisp_black ///
                hispanic, fe i(n_subject_id) robust
        }
        else {
            gen post_baseline = post_period * `var'_baseline
            xtreg `var'_panel post_period treat_now post_treat_now ///
                post_baseline `var'_time rural hba1c_cat_control `agebins' ///
                male race_nonhisp_black hispanic, fe i(n_subject_id) robust
        }
        // Fill in table3 frame's cells
        frame change table3
        replace adj_diff = r(table)["b", "post_treat_now"] if ///
            _n == `table3_rows'
        replace adj_ci_low = r(table)["ll", "post_treat_now"] if ///
            _n == `table3_rows'
        replace adj_ci_high = r(table)["ul", "post_treat_now"] if ///
            _n == `table3_rows'
        frame change default       
    }
}

frame table3: outsheet using "output/T3_biometrics.csv", comma replace
*------------------------------------------------------------------------------*


*------------------------------------------------------------------------------*
*------------- Table 4. Model Comparisons for Use Outcomes --------------*
// Adjusted differences and CIs for all utilization outcomes are computed with
// a heteroskedasticity-robust, longitudinal, fixed effects model that includes 
// controls for a flexible baseline and prespecified control variables (rural 
// site, baseline HbA1c category, 5-year age bins, and indicators for being 
// male, non-Hispanic Black, and Hispanic).
clear all
use `analysis_sample', clear

local ehr_outcomes "ed_or_ip_i op_i ed_or_ip_n op_n ed_or_ip_payment op_payment nf_payment oth_payment total_payment drug_insulin drug_metformin drug_sulf drug_glp1 drug_other drug_any_db"

// Set up 5-year age bins so that excluded group is the top bin (72+)
unab agebins: age5_*
unab ageomit: age5_10
local agebins: list agebins - ageomit

frame create table4_regs
frame change table4_regs
set obs 0
gen variable = ""
gen months = . 
gen adj_diff = .
gen adj_ci_low = .
gen adj_ci_high = .
gen p = .
frame change default
local out_rows = 0

foreach var of local ehr_outcomes {
    foreach month in 6 12 {
        gen MEAN_`var'_`month' = `var'_`month'mo
        gen SD_`var'_`month' = `var'_`month'mo
    }
}
rename MEAN_drug* N_drug*
rename MEAN_*_i_* N_*_i_*
drop SD_drug* SD_*_i_*

gen index = _n
collapse (count) N=index (mean) MEAN_* (sd) SD_* (sum) N_*, by(treat_now)
foreach var of varlist N_* {
    gen PCT_`var' = (`var' / N) * 100
}
rename PCT_N_* PCT_*

xpose, clear varname
rename v1 control
rename v2 treatment
rename _varname variable

outsheet using "output/T4_means.csv", comma replace


// EHR outcomes (regressions)
foreach month in 6 12 {
    foreach var of local ehr_outcomes {
        use `analysis_sample', clear

        // Preserve baseline value before making longitudinal
        gen `var'_baseline = `var'_0mo

        // Keep only subjects who have valid values at relevant month
        drop if mi(`var'_`month'mo)
        // Keep the 0- and n-month columns (drop the excluded month's column)
        unab todrop: `var'_*mo
        unab tokeep: `var'_`month'mo `var'_0mo
        local todrop: list todrop - tokeep
        drop `todrop'

        // Reshape to be longitudinal
        reshape long `var'_@mo, i(subject_id) j(months)
        // Clean up after transformation
        rename `var'_mo `var'_panel

        // Indicator for the value we'd look at in a cross-sectional model (value at
        // 6 months)
        gen post_period = (months == `month')

        // Interaction terms:
        // Variable of interest: 1[6-month outcome] * 1[treatment group]
        gen post_treat_now = post_period * treat_now
        // Flexible baseline: 1[6-month outcome] * baseline value
        gen post_baseline = post_period * `var'_baseline

        // Unique subject identifier
        egen n_subject_id = max(_n), by(subject_id)

        // Confirm longitudinal form suitable for xtreg
        xtset n_subject_id post_period

        xtreg `var'_panel post_period treat_now post_treat_now post_baseline ///
            rural hba1c_cat_control `agebins' male race_nonhisp_black ///
            hispanic, fe i(n_subject_id) robust

        // Store estimates in the table4_regs frame
        quietly {
            local out_rows = `out_rows' + 1
            frame change table4_regs
            set obs `out_rows'
            replace variable = "`var'" if _n == `out_rows'
            replace adj_diff = r(table)["b", "post_treat_now"] if ///
                _n == `out_rows'
            replace adj_ci_low = r(table)["ll", "post_treat_now"] if ///
                _n == `out_rows'
            replace adj_ci_high = r(table)["ul", "post_treat_now"] if ///
                _n == `out_rows'
            replace p = r(table)["pvalue", "post_treat_now"] if _n == `out_rows'
            replace month = `month' if _n == `out_rows'
            frame change default
        }
    }
}

frame table4_regs: outsheet using "output/T4_regressions.csv", comma replace
*------------------------------------------------------------------------------*


log close
clear all